#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2023/9/13 17:06
# @Author  : 王凯
# @File    : henan_illegal.py
# @Project : spider-man
import re

from apps.tax_illegal.tax_illegal.items import NetTaxIllegalProItem
from apps.tax_illegal.tax_illegal.spiders import BaseTaxIllegalSpider
from utils.tools import urlencode, parse_url_params


class HeNanIllegalSpider(BaseTaxIllegalSpider):
    """Spider for the "major tax violation cases" public-disclosure list of the
    Henan provincial tax bureau, plus the province's e-tax search endpoint.

    Flow: landing page -> extract pageId/moduleId from the left-nav form ->
    POST the paginated list endpoint -> follow each case link -> parse the
    detail table into a ``NetTaxIllegalProItem``.
    """

    name = "henan_illegal"
    province: str = "河南"
    # Landing (first) page of the disclosure list.
    url: str = "https://henan.chinatax.gov.cn/henanchinatax/xxgk/zdsswfsxaj/index.html"

    def start_requests(self):
        # Two entry points: the public disclosure index page and the e-tax
        # search API (handled by the base class's etax_search generator).
        yield self.Request("https://henan.chinatax.gov.cn/henanchinatax/xxgk/zdsswfsxaj/index.html")
        yield from self.etax_search(etax_url="https://etax.henan.chinatax.gov.cn:8443")

    def parse(self, response, **kwargs):
        """Pull pageId/moduleId out of the landing page and request page 1 of the list."""
        form = response.xpath('//form[contains(@class, "leftNavWrap")]')
        page_id = form[0].re_first(r'pageId=(.*?)[&"]')
        module_id = form[0].re_first(r'moduleId=(.*?)[&"]')
        params = {"pageId": page_id, "currentPage": 1, "moduleId": module_id}
        url = "https://henan.chinatax.gov.cn/eportal/ui"
        yield self.FormRequest(
            url + "?" + urlencode(params),
            method="POST",
            formdata={"filter_LIKE_main.TITLE": ""},  # empty title filter = all records
            callback=self.parse_list,
        )

    def parse_list(self, response, **kwargs):
        """Parse the first result page, then schedule every remaining page."""
        yield from self.parse_manuscript(response, **kwargs)
        root_url, params = parse_url_params(response.url)
        total_pages = response.xpath("//input[@totalpage]/@totalpage").get()
        if total_pages:
            # BUG FIX: .get() returns a string, so the previous int(page_num[0])
            # parsed only the FIRST DIGIT of the page count (e.g. "15" -> 1),
            # silently dropping most list pages.
            for page in range(2, int(total_pages) + 1):
                yield self.FormRequest(
                    root_url + "?" + urlencode({**params, "currentPage": page}),
                    method="POST",
                    callback=self.parse_manuscript,
                )

    def parse_manuscript(self, response, **kwargs):
        """Follow each case link on a list page to its detail page."""
        for a in response.xpath('//a[@class="clickA"]'):
            yield response.follow(a.xpath("./@href").get(), callback=self.parse_detail)

    def parse_detail(self, response, **kwargs):
        """Parse the detail table (th = label, td = value) into an item."""
        item = NetTaxIllegalProItem()
        info = {}
        for row in response.xpath('//table[@class="zhongdatable"]/tbody/tr'):
            label = row.xpath(".//th").xpath("string(.)").get()
            value = row.xpath(".//td").xpath("string(.)").get()
            # Skip label-less rows; tolerate a missing <td> (the old code
            # crashed with AttributeError because .get() returned None).
            if label:
                info[label] = (value or "").strip()

        # Chinese table label -> item field name.
        mapping = {
            "纳税人名称": "company_name",
            "纳税人识别号或社会信用代码": "taxpayer_id",
            "组织机构代码": "org_code",
            "注册地址": "address",
            "负有直接责任的中介机构信息": "resp_intermediary",
            "案件性质": "illegal_status",
            "主要违法事实": "illegal_facts",
            "相关法律依据及税务处理处罚情况": "basis_and_punishment",
        }

        for label, field in mapping.items():
            if label in info:
                # Strip HTML comments first, then any remaining tags.
                text = re.sub(r"(<!--.*?-->)", "", info[label])
                setattr(item, field, re.sub(r"(<.*?>)", "", text).strip())

        # Join the four "legal representative" sub-fields. Missing entries
        # become empty strings instead of raising TypeError (None + str),
        # as the previous chained concatenation did.
        item.legal_representative = "，".join(
            info.get(key, "")
            for key in (
                "法定代表人或负责人或法院裁判确定的实际责任人姓名",
                "法定代表人或负责人或法院裁判确定的实际责任人性别",
                "法定代表人或负责人或法院裁判确定的实际责任人证件名称",
                "法定代表人或负责人或法院裁判确定的实际责任人证件号码",
            )
        )

        item.province = self.province
        # PubDate meta content starts "YYYY-MM-DD"; stored as "YYYYMMDD"
        # (field is named "year" but holds the full date — kept for
        # compatibility). A missing meta tag now yields "" instead of
        # crashing on None[:10].
        pub_date = response.xpath('//*[@name="PubDate"]/@content').get() or ""
        item.year = pub_date[:10].replace("-", "")
        yield item


if __name__ == "__main__":

    from scrapy import cmdline

    cmdline.execute(argv=["scrapy", "crawl", "henan_illegal"])
